import re
import jieba
import json
import numpy as np
from sklearn.model_selection import train_test_split

def clean_text(text):
    """Clean raw text and segment it into space-separated tokens.

    Everything except CJK ideographs (U+4E00–U+9FFF) and whitespace is
    stripped out; the remainder is word-segmented with jieba and the
    tokens are rejoined with single spaces.
    """
    # Drop all non-Chinese, non-whitespace characters, then trim the edges.
    cleaned = re.sub(r'[^\u4e00-\u9fff\s]', '', text).strip()
    # Segment with jieba and rebuild as a space-delimited token string.
    return ' '.join(jieba.cut(cleaned))

def load_json_file(file_path):
    """Read and deserialize a UTF-8 encoded JSON file.

    Args:
        file_path: Path to the JSON file on disk.

    Returns:
        The deserialized Python object (dict, list, etc.).
    """
    with open(file_path, 'r', encoding='utf-8') as fp:
        content = json.load(fp)
    return content

def save_json_file(data, file_path):
    """Serialize *data* to a UTF-8 JSON file.

    Non-ASCII characters are written verbatim (``ensure_ascii=False``)
    and the output is pretty-printed with a 2-space indent.

    Args:
        data: Any JSON-serializable Python object.
        file_path: Destination path; overwritten if it exists.
    """
    serialized = json.dumps(data, ensure_ascii=False, indent=2)
    with open(file_path, 'w', encoding='utf-8') as fp:
        fp.write(serialized)

def split_dataset(data, test_size=0.2, random_state=42):
    """Split labeled examples into stratified train/test subsets.

    Args:
        data: Sequence of dict examples, each carrying a ``'label'`` key.
        test_size: Fraction of examples placed in the test split.
        random_state: Seed forwarded to sklearn for reproducibility.

    Returns:
        A ``(train_data, test_data)`` pair.
    """
    # Stratify on per-example labels so class proportions are preserved
    # in both splits.
    labels = [example['label'] for example in data]
    return train_test_split(
        data,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )

def load_label_classes(file_path, allow_pickle=False):
    """Load an array of label classes from a ``.npy`` file.

    Args:
        file_path: Path to the ``.npy`` file (e.g. a saved
            ``LabelEncoder.classes_`` array).
        allow_pickle: Forwarded to ``np.load``. The default ``False``
            matches numpy's own default (unchanged behavior). Set True
            only for trusted files whose array holds Python objects —
            unpickling untrusted data is a security risk.

    Returns:
        numpy.ndarray of class labels.
    """
    return np.load(file_path, allow_pickle=allow_pickle)